# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Create figures and tables in supporting materials
#' @author Hauke Licht
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----

library(readr)
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(patchwork)
library(plm)
library(lmtest)

# Project-relative locations: inputs are read from data/output and helper
# scripts are sourced from code/helpers.
base_path <- file.path(".")
data_path <- file.path(base_path, "data", "output")
helpers_path <- file.path(base_path, "code", "helpers")

# Plot helpers first, then the figure output directory (created if missing).
source(file.path(helpers_path, "plot_setup.R"))
fig_path <- file.path(base_path, "results", "figures")
dir.create(fig_path, recursive = TRUE, showWarnings = FALSE)

# Table helpers, then the table output directory (created if missing).
source(file.path(helpers_path, "table_setup.R"))
tables_path <- file.path(base_path, "results", "tables")
dir.create(tables_path, recursive = TRUE, showWarnings = FALSE)

# Single RDS bundle that every section below reads its inputs from.
paper_objects <- read_rds(file.path(data_path, "paper_objects.rds"))

# SECTION A ----

## Table S1 ----

# Country coverage table: the two code columns are set in monospace and share
# a "Country code" spanner header.
paper_objects$data$countries %>%
  quick_kable(
    label = "countries",
    caption = "Country coverage. Alpha-2 and alpha-3 country codes according to ISO 3166-1.",
    col.names = c("Country name", "alpha-2", "alpha-3"),
    align = c("l", "c", "c")
  ) %>%
  column_spec(2:3, monospace = TRUE) %>%
  add_header_above(c(" " = 1, "Country code" = 2)) %>%
  write_kable(.file.name = "sm-tableS01", dir = tables_path, overwrite = TRUE)

## Table S2 ----

# Party overview table (long table). Twitter handles are rendered as LaTeX
# hyperlinks, so the table is written with escape = FALSE.
paper_objects$tables$parties %>%
  ungroup() %>%
  select(-periods) %>%
  mutate(
    # Link via the numeric user ID (see https://stackoverflow.com/a/56924385),
    # escape underscores for LaTeX, and append an asterisk to accounts whose
    # `tweets_added_in_2023` flag is TRUE.
    screen_name = sprintf(
      "\\href{https://twitter.com/i/user/%s}{\\texttt{@%s}}%s",
      user_id,
      gsub("_", "\\\\_", screen_name),
      ifelse(tweets_added_in_2023, "\\,*", "")
    ),
    # Helper columns are dropped only after the link has been built.
    user_id = NULL,
    tweets_added_in_2023 = NULL
  ) %>%
  quick_kable(
    # paste() already joins its arguments with single spaces.
    caption = paste(
      "Parties included in our dataset.",
      "Party abbreviations, names and IDs based on \\citet{doring_parliaments_2019}.",
      "Twitter account names link to Twitter pages (using accounts' user IDs).",
      "Twitter accounts marked with an asterisk (*) flag accounts we have updated in May and June 2023.",
      "The last column reports the number of tweets recorded."
    ),
    label = "parties",
    col.names = c("Country", "Party Abbr.", "Party Name", "Party ID", "Twitter account", "$N$ tweets"),
    longtable = TRUE,
    escape = FALSE
  ) %>%
  column_spec(3, width = "1.8in") %>%
  # Merge repeated values in the first four columns.
  collapse_rows(1:4, latex_hline = "major", valign = "top") %>%
  write_kable(.file.name = "sm-tableS02", overwrite = TRUE, dir = tables_path)

## Table S3 ----

# Table of party accounts added in 2023. Handles become LaTeX hyperlinks and
# quoted handles inside the free-text `action` comment are set in \texttt.
paper_objects$tables$new_party_accounts %>%
  transmute(
    country_iso3c,
    party_name_short,
    party_id,
    # see https://stackoverflow.com/a/56924385
    screen_name = sprintf(
      "\\href{https://twitter.com/i/user/%s}{\\texttt{@%s}}",
      user_id,
      gsub("_", "\\\\_", screen_name)
    ),
    # Rewrite the first " '<handle>'" occurrence as " \texttt{@<handle>}".
    action = sub(" '([^\\)]+)'", " \\\\texttt{@\\1}", action, perl = TRUE)
  ) %>%
  # Escape underscores for LaTeX after the handle has been wrapped.
  mutate(action = gsub("_", "\\\\_", action)) %>%
  quick_kable(
    # paste() already joins its arguments with single spaces.
    caption = paste(
      "Party accounts researched and added in May and June 2023.",
      "Party abbreviations and IDs based on \\citet{doring_parliaments_2019}.",
      "Twitter account names link to Twitter pages (using accounts' user IDs).",
      "Comment ``collected (not added)'' means that we have found this account",
      "but none of its tweets were posted during the parliamentary configurations for which we record entries for the given party."
    ),
    label = "added_parties",
    col.names = c("Country", "Party Abbr.", "Party ID", "Twitter account", "Comment"),
    escape = FALSE
  ) %>%
  write_kable(.file.name = "sm-tableS03", overwrite = TRUE, dir = tables_path)

# SECTION C ----

# Long display labels, keyed by the raw coding category.
label_map <- c(
  "yes-general" = "General elite criticism (``General'')",
  "yes-specific" = "Specific elite criticism (``Specific'')",
  "yes-unsure" = "Ambiguous elite criticism (``Unsure'')",
  "no" = "No elite criticism (``No'')",
  "cannot-answer" = "Cannot answer (``Cannot answer'')"
)

# Quoted short labels, keyed by category names without the "yes-" prefix.
label_map_short <- c(
  "general" = "``General''",
  "specific" = "``Specific''",
  "unsure" = "``Unsure''",
  "no" = "``No''"
)

# Quoted short labels, keyed by the raw coding category (same keys and order
# as `label_map`).
label_abbr <- c(
  "yes-general" = "``General''",
  "yes-specific" = "``Specific''",
  "yes-unsure" = "``Unsure''",
  "no" = "``No''",
  "cannot-answer" = "``Cannot answer''"
)

# NOTE(review): within the visible part of this script `tmp` is reassigned
# (Table S11 section) before it is ever read, so this subset of political,
# English-available tweets looks like a leftover -- confirm before removing.
tmp <- filter(paper_objects$tables$sampling_eligibility_sample1, political == "yes", is_en)

## Table S4 ----

# Cross-tabulation of `political` by `is_en` with marginal proportions.
# Inside the pipe, `.` is the incoming table and `.[2:3]` selects its second
# and third columns (presumably the two count columns created by the pivot).
paper_objects$tables$sampling_eligibility_sample1 %>% 
  pivot_wider(names_from = "is_en", values_from = "n") %>% 
  # Row share of the grand total, rounded to three digits.
  mutate(total = round(rowSums(.[2:3])/sum(.[2:3]), 3)) %>% 
  # Append a final row with the column shares; the last cell is left empty.
  rbind(c("\\emph{Prop.}", round(colSums(.[2:3])/sum(.[2:3]), 3), "")) %>% 
  quick_kable(
    caption = paste(
      "Tweets in our data tabulated by the availability of an English-text version and whether their content was classified to be ``political.''"
    )
    , col.names = c("Political", "no", "yes", "\\emph{Prop.}")
    , align = c("l", "r", "r", "l")
    , escape = FALSE
    , label = "sampling_eligibility_sample1"
  ) %>% 
  add_header_above(c(" " = 1, "available in English" = 2, " " = 1)) %>% 
  write_kable(.file.name = "sm-tableS04", overwrite = TRUE, dir = tables_path)

## Figure S3 ----

# Caption text with its LaTeX label; it is stored in `cap` but not passed to
# ggsave() below.
cap <- paste(
  "Distribution of cluster sizes of 500 $k$-means clusters.",
  "Clusters obtained from 300 independent components of tweet LASER embedding representations.",
  "\\label{fig:cluster_size_histogram}"
)

# Histogram of cluster sizes using 50 bins.
p <- ggplot(paper_objects$desc$sampling$cluster_sizes, aes(x = size)) +
  geom_histogram(bins = 50, fill = "black", alpha = .9) +
  labs(x = NULL, y = "Count")

ggsave(
  filename = "sm-figureS03.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5,
  height = 2,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S4 ----

# Caption text with its LaTeX label. The default separator (a single space)
# is used so that the sentences are not glued together, matching the other
# captions in this script.
cap <- paste(
  "Two-dimensional t-SNE representation of cluster centroids.",
  "Facets report results at different perplexity values.",
  "\\label{fig:centroid_tsnes}"
)

# Scatter plot of the two t-SNE dimensions, one facet per perplexity value.
p <- paper_objects$desc$sampling$tsne_centroids %>%
  ggplot(aes(x = d1, y = d2)) +
  geom_point(alpha = .25, size = .5) +
  facet_wrap(~perplexity) +
  labs(x = NULL, y = NULL)

ggsave(
  plot = p,
  path = fig_path,
  filename = "sm-figureS04.png",
  device = "png",
  dpi = 300,
  width = 5,
  height = 3,
  units = "in",
  bg = "transparent"
)

## Figure S5 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Cluster composition in terms of country and party diversity.",
  "\\label{fig:cluster_composition}"
)

# One box per distinct value of `n_countries`, showing `n_parties`.
p <- ggplot(
  paper_objects$desc$sampling$cluster_unit_sizes,
  aes(x = n_countries, y = n_parties, group = n_countries)
) +
  geom_boxplot(fill = NA, alpha = .5, show.legend = FALSE) +
  labs(x = "Number of distinct countries", y = "Number of distinct parties")

ggsave(
  filename = "sm-figureS05.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 2,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S6 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Relation between clusters' sizes and their internal diversity in terms of party composition.",
  "\\label{fig:cluster_size_X_diversity}"
)

# Scatter of party counts against cluster size with a smoothed trend line.
p <- ggplot(
  paper_objects$desc$sampling$cluster_unit_sizes,
  aes(x = cluster_size, y = n_parties)
) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(color = "black") +
  labs(x = "Cluster size", y = "Internal party diversity")

ggsave(
  filename = "sm-figureS06.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S7 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Number of tweets sampled from cluster (vertical axis) against cluster size in terms of number of tweets (horizontal) axis.",
  "\\label{fig:cluster_unit_sizes}"
)

#' Sample size as a function of a (vector of) counts
#'
#' @param x Numeric vector.
#' @return `ceiling(log2(x)) + 1`, element-wise (e.g. 1 -> 1, 8 -> 4, 9 -> 5).
determine_sample_size <- function(x) {
  ceiling(log2(x)) + 1
}
# Per-cluster sample size derived from `n_parties`; `s` lives in the global
# environment and is picked up by aes() below.
# NOTE(review): the axis label and caption speak of cluster size, while `s` is
# computed from `n_parties` -- confirm this is the intended input.
s <- determine_sample_size(paper_objects$desc$sampling$cluster_unit_sizes$n_parties)

# Box plots of cluster size per sample-size value, flipped so that the sample
# size ends up on the vertical axis.
p <- ggplot(
  paper_objects$desc$sampling$cluster_unit_sizes,
  aes(x = s, y = cluster_size, group = s)
) +
  geom_boxplot() +
  scale_x_continuous(breaks = 1:9) +
  coord_flip() +
  labs(x = "Number of tweets sampled", y = "Cluster size")

ggsave(
  filename = "sm-figureS07.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 2,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S5 ----

# Judgment counts from the first element of `n_labels`, labelled with the long
# names from `label_map` and sorted by decreasing frequency.
paper_objects$desc$coding$n_labels[[1]] %>%
  enframe(name = "coding", value = "n") %>%
  left_join(enframe(label_map), by = c("coding" = "name")) %>%
  arrange(desc(n)) %>%
  transmute(
    Judgment = value,
    `$N$` = n,
    Proportion = n / sum(n)
  ) %>%
  quick_kable(
    caption = "Total judgment frequencies in first round of crowd coding.",
    label = "judgment_proportions_sample_1",
    escape = FALSE
  ) %>%
  kable_styling(full_width = FALSE) %>%
  write_kable(.file.name = "sm-tableS05", dir = tables_path, overwrite = TRUE)

## Figure S8 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Distribution of judgments contributed per coder in first round of crowd coding.",
  "\\label{fig:n_judgments_distribution_sample_1}"
)

# Histogram of per-coder judgment counts on a log10 axis; the number of bins
# is a quarter of the number of sample-1 rows, rounded up.
p <- ggplot(
  filter(paper_objects$data$coding$judgments_per_coder, sample == "1"),
  aes(x = n)
) +
  geom_histogram(
    bins = ceiling(sum(paper_objects$data$coding$judgments_per_coder$sample == "1") / 4),
    fill = "black"
  ) +
  scale_x_continuous(trans = "log10") +
  labs(
    x = expression("Number of judgments contributed (on "*log[10]*" scale)"),
    y = "Number of coders"
  )

ggsave(
  filename = "sm-figureS08.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 1.5,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S9 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Coder activity times and duration in first round of crowd coding.",
  "Each horizontal line represents the activity period and duration of a single coder.",
  "\\label{fig:coder_durations_sample_1}"
)

# One line range per coder from `start` to `end`, ordered by descending start
# and flipped so the ranges run horizontally; coder IDs are not printed.
p <- ggplot(
  filter(paper_objects$desc$coding$coder_durations, sample == "1"),
  aes(
    x = reorder(worker_id, desc(start)),
    ymin = start,
    ymax = end,
    group = worker_id
  )
) +
  geom_linerange() +
  coord_flip() +
  scale_x_discrete(breaks = NULL) +
  labs(x = "Coders", y = "Date time")

ggsave(
  filename = "sm-figureS09.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S10 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Coders' numbers of judgments against judgment entropies in first round of crowd coding.",
  "Vertical and horizontal jitter of max. 1\\% added to avoid over-plotting.",
  "\\label{fig:coder_n_judgments_X_entropy_sample_1}"
)

# Jittered scatter for sample 1. Colour and shape both encode whether
# `badbadnotgood` is non-missing; only the colour legend is shown.
p <- ggplot(
  filter(paper_objects$tables$coder_stats, sample == "1"),
  aes(
    x = n_judgments,
    y = judgment_entropy,
    color = !is.na(badbadnotgood),
    shape = !is.na(badbadnotgood)
  )
) +
  geom_jitter(alpha = .9, size = 1, width = .01, height = .01) +
  scale_x_continuous(trans = "log10") +
  scale_color_manual(breaks = c(FALSE, TRUE), values = c("black", "red")) +
  scale_shape_manual(breaks = c(FALSE, TRUE), values = c(1, 3)) +
  guides(shape = "none") +
  labs(
    x = expression("Number of judgments contributed (on "*log[10]*" scale)"),
    y = "Coder judgment entropy",
    color = "Contributions removed"
  )

ggsave(
  filename = "sm-figureS10.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S11 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Tweet-level variability in labels assigned (entropy) in the first round of crowd coding.",
  "Panel columns group tweets by labels that were most frequently assigned to them.",
  "\\label{fig:tweet_label_entropy_sample_1}"
)

# Entropy values are formatted to three decimals and treated as categories;
# bars show each category's share within its modal label. Facets follow the
# order of `label_abbr`, with the LaTeX quote marks stripped from the titles.
p <- paper_objects$tables$tweet_stats %>%
  filter(sample == "1") %>%
  count(
    mode_label,
    mode_label_n,
    Entropy = sprintf("%0.3f", judgment_entropy)
  ) %>%
  group_by(mode_label) %>%
  mutate(Proportion = n / sum(n)) %>%
  ggplot(aes(x = Entropy, y = Proportion)) +
  geom_col(alpha = .9) +
  coord_flip() +
  facet_wrap(
    ~factor(mode_label, names(label_abbr), gsub("``|''", "", label_abbr)),
    ncol = 1
  ) +
  labs(x = "Tweet-level judgment entropy", y = "Relative proportions")

ggsave(
  filename = "sm-figureS11.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 4,
  height = 6,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S6 -----

# Join the short labels onto the first four columns of the class-prevalence
# estimates, then drop the raw category key (first column) before printing.
enframe(label_abbr) %>%
  right_join(
    paper_objects$misc$em_fit_s1$est_class_prevl[1:4],
    by = c("name" = "coding")
  ) %>%
  select(-1) %>%
  quick_kable(
    label = "est_label_prevalence_s1",
    caption = "Model-based label prevalence estimates and label proportions due to model-based aggregation respectively plurality voting (PV) in first sample of crowd codings.",
    col.names = c("Label", "Est. prevalence", "model", "PV"),
    escape = FALSE
  ) %>%
  add_header_above(c(" " = 2, "Label proportions" = 2)) %>%
  write_kable(.file.name = "sm-tableS06", dir = tables_path, overwrite = TRUE)

## Table S7 ----

# Labelings extracted from the EM fit for sample 1.
labelings_s1 <- AnnotationModelsR::get_labeling(paper_objects$misc$em_fit_s1)

# Cross-tabulate `labeling` against `majority_vote`: one row per `labeling`
# value, one column per `majority_vote` value.
labeling_comp <- enframe(label_abbr) %>%
  right_join(
    count(labelings_s1, labeling, majority_vote) %>%
      pivot_wider(names_from = "majority_vote", values_from = "n"),
    by = c("name" = "labeling")
  ) %>%
  # Keep the display label plus the first four category columns, in the order
  # of `label_abbr`.
  .[, c("value", names(label_abbr[-5]))] %>%
  # Empty cells mean zero tweets. Only the numeric count columns are filled:
  # replace_na() casts the replacement to the column type, so applying an
  # integer 0 to the character label column fails in current tidyr.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0L)))

labeling_comp %>%
  quick_kable(
    caption = "Comparing model-based and plurality winner labelings in the first sample of crowd codings.",
    col.names = c("Model-based labeling", label_abbr[-5]),
    label = "intermethod_aggreement_s1"
  ) %>%
  add_header_above(c(" " = 1, "Plurality winner label" = 4)) %>%
  write_kable(.file.name = "sm-tableS07", overwrite = TRUE, dir = tables_path)

## Figure S12 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Estimates of coders' true label detection abilities in the first sample of crowd codings.",
  "\\label{fig:ability_estimates_s1}"
)

# Density of `est_prob` for rows where `coding` equals `labeled`, one facet
# column per label class (facet titles are the short labels without the
# surrounding LaTeX quote marks).
p <- paper_objects$misc$em_fit_s1$est_annotator_params %>%
  filter(coding == labeled) %>%
  ggplot(aes(est_prob)) +
  geom_density(alpha = .55, fill = "lightgrey", color = "darkgrey") +
  # `limits` is spelled out in full instead of relying on partial argument
  # matching of `limit`.
  scale_x_continuous(breaks = seq(.25, 1, .25), limits = 0:1) +
  facet_grid(cols = vars(factor(coding, names(label_abbr), gsub("^``|''$", "", label_abbr)))) +
  labs(x = NULL, y = "Density")

ggsave(
  plot = p,
  path = fig_path,
  filename = "sm-figureS12.png",
  device = "png",
  dpi = 300,
  height = 1.75,
  width = 5.5,
  units = "in",
  bg = "transparent"
)

##  Table S8 ----

# Summary statistics of `est_prob` per label class for rows where `coding`
# equals `labeled`. Missing values are dropped consistently in every summary.
paper_objects$misc$em_fit_s1$est_annotator_params %>%
  filter(coding == labeled) %>%
  mutate(tmp = factor(coding, names(label_abbr), label_abbr)) %>%
  group_by(`Label class` = tmp) %>%
  summarise(
    Mean = mean(est_prob, na.rm = TRUE),
    `Std. Dev.` = sd(est_prob, na.rm = TRUE),
    Skewness = e1071::skewness(est_prob, na.rm = TRUE),
    `10%` = quantile(est_prob, .1, na.rm = TRUE),
    `25%` = quantile(est_prob, .25, na.rm = TRUE),
    Median = median(est_prob, na.rm = TRUE),
    `75%` = quantile(est_prob, .75, na.rm = TRUE),
    # The column is labelled as the 90th percentile, so compute that quantile
    # (the previous code computed the 99th percentile under this label).
    `90%` = quantile(est_prob, .9, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Sort by the label-class factor (first column).
  arrange_at(1) %>%
  quick_kable(
    caption = "Summary statistics of  coders' true label detection abilities in the first sample of crowd codings.",
    label = "ability_estimates_s1"
  ) %>%
  add_header_above(c(" " = 1, "Moments" = 3, "Quantiles" = 5)) %>%
  write_kable(.file.name = "sm-tableS08", overwrite = TRUE, dir = tables_path)

## Table S9 ----

# Sample-1 labeled tweets without the "cannot-answer" class.
labeled_tweets <- paper_objects$data$labelings$labeled_tweets_s1 %>%
  filter(labeling != "cannot-answer")

# Frequency table of the remaining labels.
labeling_counts <- table(labeled_tweets$labeling)

# Per-class counts in the order of `label_abbr`, with a running total;
# columns are reordered to display label, raw code, count.
enframe(label_abbr) %>%
  left_join(
    count(labeled_tweets, labeling),
    by = c("name" = "labeling")
  ) %>%
  select(2, 1, 3) %>%
  mutate(cumsum = cumsum(n)) %>%
  quick_kable(
    label = "labeling_distribution_sample_1",
    caption = "Numbers of tweets by label class.",
    col.names = c("Label", "Code", "$N$", "Cum. sum"),
    escape = FALSE
  ) %>%
  write_kable(.file.name = "sm-tableS09", dir = tables_path, overwrite = TRUE)

## Table S10 -----

# Per-class counts in the `training` and `test` elements of the sample-1
# training data, joined onto the short labels; the raw category key (first
# column) is dropped before printing.
enframe(label_abbr) %>%
  left_join(
    count(paper_objects$data$labelings$training_data_s1$training, labeling),
    by = c("name" = "labeling")
  ) %>%
  left_join(
    count(paper_objects$data$labelings$training_data_s1$test, labeling),
    by = c("name" = "labeling")
  ) %>%
  select(-1) %>%
  quick_kable(
    label = "train_val_n_sample_1",
    caption = "Numbers of tweets by label class in the training data.",
    col.names = c("Label", "$N_{\\text{train}}$", "$N_{\\text{val}}$"),
    escape = FALSE
  ) %>%
  write_kable(.file.name = "sm-tableS10", dir = tables_path, overwrite = TRUE)

## Table S11 ----

# Metrics reported in the table, in display order.
these_metrics <- c("Balanced Accuracy", "F1", "Precision", "Sensitivity", "Specificity")

# First two columns of the grid-search row with the highest
# `Mean_BalancedAccuracy`; read below as `alpha` and `lambda`.
best_glmnet <- paper_objects$misc$glmnet_s1_grid_search_results[which.max(paper_objects$misc$glmnet_s1_grid_search_results$Mean_BalancedAccuracy), 1:2]

# Reshape the grid-search results to one row per (alpha, lambda, metric, what)
# with `mean` and `sd` columns.
glmnet_grid_search_res <- paper_objects$misc$glmnet_s1_grid_search_results %>% 
  pivot_longer(-c(1:2)) %>% 
  # Keep only columns whose name contains one of the metrics (spaces removed).
  filter(grepl(paste(gsub(" ", "", these_metrics), collapse = "|"), name)) %>%
  mutate(
    # A trailing "SD" marks standard-deviation columns; strip it so mean and
    # SD share the same name and can be spread side by side.
    is_sd = factor(grepl("SD$", name), c(T, F), c("sd", "mean"))
    , name = sub("SD$", "", name)
  ) %>% 
  pivot_wider(names_from = "is_sd", values_from = "value") %>% 
  # Split "<metric>__<class>" (class missing -> NA), then "<stat>_<metric>"
  # (stat missing -> NA).
  separate(name, c("metric", "class"), "__", fill = "right") %>% 
  separate(metric, c("stat", "metric"), "_", fill = "left") %>% 
  mutate(
    # Row identifier: the class when present, otherwise the statistic prefix.
    what = ifelse(is.na(class), stat, class)
    # Insert a space before each capital letter that follows a word character
    # (e.g. "BalancedAccuracy" -> "Balanced Accuracy").
    , metric = gsub("(?<=\\w)(?=[A-Z])", " ", metric, perl = TRUE)
  ) %>% 
  select(-stat, -class)

# Header labels: same as `these_metrics` but with "Sensitivity" shown as
# "Recall".
tmp <- these_metrics
tmp[tmp == "Sensitivity"] <- "Recall"

# Performance of the best (alpha, lambda) combination, one row per `what`
# value (excluding "WMean"), with mean/SD column pairs per metric.
glmnet_grid_search_res %>% 
  filter(alpha == best_glmnet$alpha) %>% 
  filter(lambda == best_glmnet$lambda) %>% 
  filter(what != "WMean") %>% 
  pivot_wider(names_from = "metric", values_from = c("mean", "sd")) %>% 
  # Order columns as mean_<metric>, sd_<metric> for each metric in turn.
  select(what, !!paste(rep(c("mean", "sd"), length(these_metrics)), rep(these_metrics, each = 2), sep = "_")) %>% 
  # Class rows get their short display label; the "Mean" row keeps its name.
  mutate(what = ifelse(what == "Mean", what, label_map_short[what])) %>% 
  quick_kable(
    caption = "Mean cross-validation performance of best GLM-Net model in training split of first sample of labeled tweets."
    , col.names = c(" ", rep(c("Mean", "SD"), length(tmp)))
    , escape = FALSE
    , label = "mean_cv_performance_best_glmnet_sample_1"
  ) %>% 
  add_header_above(c(" " = 1, setNames(rep(2, length(tmp)), tmp))) %>% 
  write_kable(.file.name = "sm-tableS11", overwrite = TRUE, dir = tables_path)

## Table S12 ----

# Judgment counts from the second element of `n_labels`, labelled with the
# long names from `label_map` and sorted by decreasing frequency.
enframe(paper_objects$desc$coding$n_labels[[2]], name = "coding", value = "n") %>%
  left_join(enframe(label_map), by = c("coding" = "name")) %>%
  arrange(desc(n)) %>%
  transmute(
    # Categories without an entry in `label_map` are shown as "invalid".
    # tidyr::replace_na() is used because tidyr is attached by this script,
    # whereas stringr (home of str_replace_na()) is not.
    Judgment = replace_na(value, "\\texttt{invalid}"),
    `$N$` = n,
    Proportion = n / sum(n)
  ) %>%
  quick_kable(
    caption = "Total judgment frequencies in second round of crowd coding.",
    escape = FALSE,
    label = "judgment_proportions_sample_2"
  ) %>%
  kable_styling(full_width = FALSE) %>%
  write_kable(.file.name = "sm-tableS12", overwrite = TRUE, dir = tables_path)

## Figure S13 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Distribution of judgments contributed per coder in second round of crowd coding.",
  "\\label{fig:n_judgments_distribution_sample_2}"
)

# Histogram of per-coder judgment counts on a log10 axis; the number of bins
# is a quarter of the number of sample-2 rows, rounded up. `sample` is compared
# against the string "2" in both places, as in the rest of the script.
p <- paper_objects$data$coding$judgments_per_coder %>%
  filter(sample == "2") %>%
  ggplot(aes(x = n)) +
  geom_histogram(
    bins = ceiling(sum(paper_objects$data$coding$judgments_per_coder$sample == "2") / 4),
    fill = "black"
  ) +
  scale_x_continuous(trans = "log10") +
  labs(
    x = expression("Number of judgments contributed (on "*log[10]*" scale)"),
    y = "Number of coders"
  )

ggsave(
  plot = p,
  path = fig_path,
  filename = "sm-figureS13.png",
  device = "png",
  dpi = 300,
  height = 1.75,
  width = 5.5,
  units = "in",
  bg = "transparent"
)

## Figure S14 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Coder activity times and duration in the second round of crowd coding.",
  "Each horizontal line represents the activity period and duration of a single coder.",
  "\\label{fig:coder_durations_sample_2}"
)

# One line range per coder from `start` to `end`, ordered by descending start
# and flipped so the ranges run horizontally; coder IDs are not printed.
p <- ggplot(
  filter(paper_objects$desc$coding$coder_durations, sample == "2"),
  aes(
    x = reorder(worker_id, desc(start)),
    ymin = start,
    ymax = end,
    group = worker_id
  )
) +
  geom_linerange() +
  coord_flip() +
  scale_x_discrete(breaks = NULL) +
  labs(x = "Coders", y = "Date time")

ggsave(
  filename = "sm-figureS14.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S15 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Coders' numbers of judgments against judgment entropies in second round of crowd coding.",
  "Vertical and horizontal jitter of max. 1\\% added to avoid over-plotting.",
  "\\label{fig:coder_n_judgments_X_entropy_sample_2}"
)

# Jittered scatter for sample 2. Colour and shape both encode whether
# `badbadnotgood` is non-missing; only the colour legend is shown.
p <- paper_objects$tables$coder_stats %>%
  filter(sample == "2") %>%
  ggplot(
    aes(
      x = n_judgments,
      y = judgment_entropy,
      color = !is.na(badbadnotgood),
      shape = !is.na(badbadnotgood)
    )
  ) +
  geom_jitter(alpha = .9, size = 1, width = .01, height = .01) +
  scale_x_continuous(trans = "log10") +
  scale_color_manual(breaks = c(FALSE, TRUE), values = c("black", "red")) +
  scale_shape_manual(breaks = c(FALSE, TRUE), values = c(1, 3)) +
  guides(shape = "none") +
  labs(
    x = expression("Number of judgments contributed (on "*log[10]*" scale)"),
    y = "Coder judgment entropy",
    color = "Contributions removed"
  )

ggsave(
  filename = "sm-figureS15.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S16 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Tweet-level variability in labels assigned (entropy) in the second round of crowd coding.",
  "Panel columns group tweets by labels that were most frequently assigned to them.",
  "\\label{fig:tweet_label_entropy_sample_2}"
)

# Entropy values are formatted to three decimals and treated as categories;
# bars show each category's share within its modal label. Facets follow the
# order of `label_abbr`, with the LaTeX quote marks stripped from the titles.
p <- paper_objects$tables$tweet_stats %>%
  filter(sample == "2") %>%
  count(
    mode_label,
    mode_label_n,
    Entropy = sprintf("%0.3f", judgment_entropy)
  ) %>%
  group_by(mode_label) %>%
  mutate(Proportion = n / sum(n)) %>%
  ggplot(aes(x = Entropy, y = Proportion)) +
  geom_col(alpha = .9) +
  coord_flip() +
  facet_wrap(
    ~factor(mode_label, names(label_abbr), gsub("``|''", "", label_abbr)),
    ncol = 1
  ) +
  labs(x = "Tweet-level judgment entropy", y = "Relative proportions")

ggsave(
  filename = "sm-figureS16.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 4,
  height = 6,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S13 ----

# Sample descriptives table. The first literal "$" in `per_unit_pay` is
# escaped for LaTeX (fixed-string replacement), since the table is written
# with escape = FALSE.
paper_objects$tables$sample_descriptives %>%
  mutate(per_unit_pay = sub("$", "\\$ ", per_unit_pay, fixed = TRUE)) %>%
  quick_kable(
    caption = paste(
      "Sample descriptives and crowd-coding statistics.",
      "Rows report values for samples 1 and 2 collected in the first and second iteration of our selective sampling strategy.",
      "Column two reports the total number of tweets distributed for coding in each round sample.",
      "Columns grouped under the header ``Crowd-coding statistics'' report",
      "the number of codings collected per tweet,",
      "the total number of codings collected,",
      "the total number of coders that have contributed to coding,",
      "and the per-task pay.",
      "Columns grouped under the header ``Median coding duration'' report summary statistics of the coder-level median times spent for coding a tweet."
    ),
    label = "coding_sample_descriptives",
    col.names = c(
      "Sample", "Tweets",
      "per tweet", "Codings", "Coders", "Task pay",
      "Median", "Mean", "SD", "Skew"
    ),
    escape = FALSE,
    align = "r"
  ) %>%
  # Spanner headers: 2 ungrouped columns, 4 crowd-coding columns, 4 duration
  # columns.
  add_header_above(
    c(
      " " = 2,
      "Crowd-coding statistics" = 4,
      "Coder: Median coding duration" = 4
    ),
    italic = TRUE
  ) %>%
  write_kable(.file.name = "sm-tableS13", overwrite = TRUE, dir = tables_path)

## Table S14 ----

# Data-cleaning table: rows sorted by sample, first column dropped, and three
# columns renamed to their LaTeX-formatted headers.
paper_objects$tables$judgment_data_cleaning %>%
  arrange(sample) %>%
  select(-1) %>%
  rename(
    `\\textbf{Removal steps}` = step,
    `\\texttt{NA}` = `NA`,
    `\\emph{Total}` = total
  ) %>%
  # Missing counts are shown as 0. A lambda is used because passing extra
  # arguments through across()'s `...` is deprecated since dplyr 1.1.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  quick_kable(
    caption = paste(
      "Number of codings across categories retained at subsequent data cleaning steps by sample."
    ),
    label = "judgment_data_cleaning",
    escape = FALSE,
    align = c("l", rep("r", 7))
  ) %>%
  # Rows 1-5 are grouped as sample 1 and rows 6-10 as sample 2.
  group_rows("Sample 1", 1, 5, italic = TRUE) %>%
  group_rows("Sample 2", 6, 10, italic = TRUE, hline_before = TRUE) %>%
  column_spec(1) %>%
  column_spec(2:4, width = "0.47in") %>%
  column_spec(5, width = "0.4in") %>%
  column_spec(6, width = "0.5in") %>%
  column_spec(7, width = "0.1in") %>%
  column_spec(8, width = "0.3in") %>%
  add_header_above(c(" " = 1, "Coding categories" = 5, " " = 2), bold = TRUE) %>%
  write_kable(.file.name = "sm-tableS14", overwrite = TRUE, dir = tables_path)

## Figure S17 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Number of judgments against coder judgment entropy by sample.",
  "Vertical and horizontal jitter of max. 1\\% added to avoid over-plotting.",
  "Removed (retained) coders are shown as red crosses (hollow black circles).",
  "Note that values on the horizontal axis are reported on a $\\log_{10}$-scale.",
  "\\label{fig:n_judgments_X_entropy_both_samples}"
)

# `n_` counts how many coders share the same (sample, n_judgments,
# judgment_entropy) combination, i.e. how many points would coincide.
tmp <- paper_objects$tables$coder_stats %>%
  group_by(sample, n_judgments, judgment_entropy) %>%
  mutate(n_ = n()) %>%
  ungroup()

# Coinciding points (n_ > 1) are jittered; unique points are drawn in place.
# Colour and shape both encode whether `badbadnotgood` is non-missing.
p <- ggplot(
  tmp,
  aes(
    x = n_judgments,
    y = judgment_entropy,
    color = factor(!is.na(badbadnotgood), c(FALSE, TRUE), c("no", "yes")),
    shape = !is.na(badbadnotgood)
  )
) +
  geom_jitter(
    data = filter(tmp, n_ > 1),
    alpha = .75, size = 1, width = .01, height = .01
  ) +
  geom_point(
    data = filter(tmp, n_ == 1),
    alpha = .75, size = 1
  ) +
  scale_x_log10() +
  scale_color_manual(breaks = c("no", "yes"), values = c("black", "red")) +
  scale_shape_manual(breaks = c(FALSE, TRUE), values = c(1, 3)) +
  guides(shape = "none") +
  facet_grid(cols = vars(paste("Sample", sample))) +
  labs(
    title = NULL,
    subtitle = NULL,
    x = expression("Number of judgments contributed (on "*log[10]*" scale)"),
    y = "Coder judgment entropy",
    color = "Coder removed:"
  )

ggsave(
  filename = "sm-figureS17.png",
  plot = p,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S15 ----

# Short labels joined with the pooled-fit prevalence estimates (without the
# `prop_mv` and `prop_annotations` columns) and with per-label counts from the
# class-probability table; the raw category key is dropped before printing.
enframe(label_abbr) %>%
  right_join(
    paper_objects$misc$em_fit_pooled$est_class_prevl %>%
      select(-prop_mv, -prop_annotations),
    by = c("name" = "coding")
  ) %>%
  right_join(
    paper_objects$misc$em_fit_pooled$est_class_probs %>%
      count(labeling),
    by = c("name" = "labeling")
  ) %>%
  select(-1) %>%
  quick_kable(
    label = "em_fit_prevalence_pooled",
    caption = "Model-based label prevalence estimates and label proportions and counts due to model-based aggregation from Dawid--Skene model fitted to the pooled codings in samples 1 and 2 retained after data cleaning.",
    col.names = c("\\textbf{Label class}", "Est. prevalence", "Proportion", "$N$"),
    escape = FALSE
  ) %>%
  add_header_above(c(" " = 2, "Induced labels" = 2), bold = TRUE) %>%
  write_kable(.file.name = "sm-tableS15", dir = tables_path, overwrite = TRUE)

## Figure S18 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Distribution of estimated coder label class detection ability parameters",
  "obtained by fitting a Dawid--Skene model to the pooled codings in samples 1 and 2 retained after data cleaning.",
  "\\label{fig:ability_estimates_pooled}"
)

# Density of `est_prob` for rows where `coding` equals `labeled`, one facet
# column per label class (first four classes of `label_abbr`, quote marks
# stripped from the facet titles).
p <- paper_objects$misc$em_fit_pooled$est_annotator_params %>%
  filter(coding == labeled) %>%
  ggplot(aes(est_prob)) +
  geom_density(alpha = .55, fill = "lightgrey", color = "darkgrey") +
  # `limits` is spelled out in full instead of relying on partial argument
  # matching of `limit`.
  scale_x_continuous(breaks = seq(0.25, 1, .25), limits = 0:1) +
  facet_grid(
    cols = vars(
      factor(
        coding,
        names(label_abbr)[-5],
        gsub("``|''", "", label_abbr[-5])
      )
    ),
    scales = "free"
  ) +
  labs(x = NULL, y = "Density")

ggsave(
  plot = p,
  path = fig_path,
  filename = "sm-figureS18.png",
  device = "png",
  dpi = 300,
  height = 1.75,
  width = 5.5,
  units = "in",
  bg = "transparent"
)

## Table S16 ----

# Summary statistics of `est_prob` per label class for rows where `coding`
# equals `labeled` (pooled fit). Column names carry LaTeX-escaped percent
# signs because the table is written with escape = FALSE.
anno_params_sum <- paper_objects$misc$em_fit_pooled$est_annotator_params %>%
  filter(coding == labeled) %>%
  group_by(` ` = coding) %>%
  summarise(
    Mean = mean(est_prob, na.rm = TRUE),
    `Std. Dev.` = sd(est_prob, na.rm = TRUE),
    Skewness = e1071::skewness(est_prob, na.rm = TRUE),
    `10\\%` = quantile(est_prob, .1, na.rm = TRUE),
    `25\\%` = quantile(est_prob, .25, na.rm = TRUE),
    Median = median(est_prob, na.rm = TRUE),
    `75\\%` = quantile(est_prob, .75, na.rm = TRUE),
    # The column is labelled as the 90th percentile, so compute that quantile
    # (the previous code computed the 99th percentile under this label).
    `90\\%` = quantile(est_prob, .9, na.rm = TRUE),
    # Share of estimates above .25.
    `\\% > chance` = mean(est_prob > .25, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Turn the class key (first column) into a factor ordered and labelled by
  # the first four entries of `label_abbr`, then sort by it.
  mutate_at(1, factor, names(label_abbr)[-5], label_abbr[-5]) %>%
  arrange_at(1)

anno_params_sum %>%
  quick_kable(
    caption = 'Summary statistics of coder label detection ability estimates of the Dawid--Skene model fitted to the pooled and cleaned codings.',
    label = "anno_params_sum_pooled",
    escape = FALSE
  ) %>%
  add_header_above(c(" " = 1, "Moments" = 3, "Quantiles" = 5, " " = 1)) %>%
  write_kable(.file.name = "sm-tableS16", overwrite = TRUE, dir = tables_path)


# SECTION D ----

## Table S17 ----

# Lookup vectors for display names; the "transformer" entry of the model map
# is set to "XLM-Twitter".
approach_map <- paper_objects$desc$classifiers$approach_map

models_map <- paper_objects$desc$classifiers$models_map
models_map["transformer"] <- "XLM-Twitter"

# Classifier performances with display names attached. Rows without a learner
# are treated as the transformer; "kmlp" learners are excluded. For the "mse"
# approach the model name is prefixed with the feature set (with the
# " embeddings" suffix removed).
tmp <- paper_objects$data$classifiers_performances_binary_wide %>%
  mutate(
    approach_str = factor(approach, approach_map, names(approach_map)),
    learner = ifelse(is.na(learner), "transformer", learner)
  ) %>%
  filter(learner != "kmlp") %>%
  mutate(
    learner_str = as.character(factor(learner, names(models_map), models_map)),
    learner_str = ifelse(
      approach == "mse",
      paste(sub(" embeddings", "", features), learner_str, sep = " + "),
      learner_str
    )
  )

# Keys identifying one model; stated explicitly so the joins below do not
# depend on whichever columns happen to share a name.
model_keys <- c("approach_str", "learner_str")

tab <- tmp %>%
  # Averaged F1 scores ("... avg" rows), one column per averaging scheme.
  filter(grepl("avg", what)) %>%
  transmute(features, approach_str, learner_str, f1 = `f1-score`, what = sub(" avg", "", what)) %>%
  pivot_wider(names_from = "what", values_from = "f1") %>%
  arrange(desc(features), desc(macro)) %>%
  select(-features) %>%
  # Precision and recall come from the "pos" rows.
  left_join(
    tmp %>%
      filter(what == "pos") %>%
      select(approach_str, learner_str, precision, recall),
    by = model_keys
  ) %>%
  # Specificity is the recall of the "neg" rows.
  left_join(
    tmp %>%
      filter(what == "neg") %>%
      select(approach_str, learner_str, specificity = recall),
    by = model_keys
  ) %>%
  select(approach_str, learner_str, macro, micro, precision, recall, specificity)

tab %>%
  quick_kable(
    caption = paste(
      "Performance of models trained with different approaches and algorithms.",
      "Models sorted by macro F1 score within approach."
    ),
    col.names = c("Approach", "Model", "$F1_{\\mbox{macro}}$", "$F1_{\\mbox{micro}}$", "Precision", "Recall", "Specificity"),
    align = c(rep("l", 2), rep("c", 5)),
    label = "performances",
    escape = FALSE
  ) %>%
  collapse_rows(
    1:2,
    row_group_label_position = "stack",
    latex_hline = "none",
    valign = "top"
  ) %>%
  write_kable(.file.name = "sm-tableS17", overwrite = TRUE, dir = tables_path)


## Figure S19 ----

# Caption text with its LaTeX label (stored only; not passed to ggsave()).
cap <- paste(
  "Text features most distinctive among tweets labeled as general elite criticism instances by our XLM-T classifier in the five English-speaking countries in our sample.",
  "Plot panels list the 20 terms per country that are most distinctive among tweets predicted to contain general elite criticism.",
  "Color shading indicates $z$-scores (lighter colors indicate higher scores, i.e., higher distinctiveness).",
  "$z$-scores obtained by applying the feature extraction method proposed by \\citet{monroe_fightin_2008}.",
  "\\label{fig:fighting_words_en}"
)

# The plot object is stored ready-made in `paper_objects`; it is only saved.
ggsave(
  filename = "sm-figureS19.png",
  plot = paper_objects$figures$validation$fighting_words_en,
  path = fig_path,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Figure S20 ----

# Caption for Figure S20; the sentences are joined with single spaces.
# NOTE(review): `cap` is not referenced again in this part of the script --
# presumably kept as the reference caption for the manuscript; confirm.
cap <- paste(
  c(
    "Text features most distinctive among tweets labeled as general elite criticism instances by our XLM-T classifier in the German-speaking countries in our sample.",
    "Plot panels list the 20 terms per country that are most distinctive among tweets predicted to contain general elite criticism.",
    "Color shading indicates $z$-scores (lighter colors indicate higher scores, i.e., higher distinctiveness).",
    "$z$-scores obtained by applying the feature extraction method proposed by \\citet{monroe_fightin_2008}.",
    "\\emph{Note:} Results for Luxembourg omitted because of low number of anti-elite tweets.",
    "\\label{fig:fighting_words_de}"
  ),
  collapse = " "
)

# Export the stored plot object as a 5.5 x 3 inch PNG at 300 dpi.
ggsave(
  filename = "sm-figureS20.png",
  path = fig_path,
  plot = paper_objects$figures$validation$fighting_words_de,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)


## Figure S21 ----

# Caption for Figure S21; the sentences are joined with single spaces.
# NOTE(review): `cap` is not referenced again in this part of the script --
# presumably kept as the reference caption for the manuscript; confirm.
cap <- paste(
  c(
    "Text features most distinctive among tweets labeled as general elite criticism instances by our XLM-T classifier in Spain.",
    "Plot panels list the 20 terms per language that are most distinctive among tweets predicted to contain general elite criticism.",
    "Color shading indicates $z$-scores (lighter colors indicate higher scores, i.e., higher distinctiveness).",
    "$z$-scores obtained by applying the feature extraction method proposed by \\citet{monroe_fightin_2008}.",
    "\\emph{Note:} Results for tweets written in Portuguese omitted due to the small sample size.",
    "\\label{fig:fighting_words_esp}"
  ),
  collapse = " "
)

# Export the stored plot object as a 5.5 x 3 inch PNG at 300 dpi.
ggsave(
  filename = "sm-figureS21.png",
  path = fig_path,
  plot = paper_objects$figures$validation$fighting_words_esp,
  device = "png",
  width = 5.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

# SECTION E ----

## Table S18 ----

# Table S18: reshape the stored correlations so that each value of `quarters`
# becomes its own column (filled with `r`), then render and write the table.
paper_objects$tables$ches_correlations_detailed %>% 
  pivot_wider(names_from = "quarters", values_from = "r") %>% 
  quick_kable(
    caption = paste(
      "Correlations of CHES anti-elite salience estimates with our anti-elite strategy estimates computed by aggregating varying numbers of tweets prior to CHES waves' field end dates."
      , "Our estimates of anti-elite strategies were obtained by aggregating parties' tweets in the 8, 6, 4, respectively 2 quarters prior to the field end date of a given CHES wave."
      , "Parties with less than 100 tweets in these date ranges were omitted."
      , sep = " "
    )
    # NOTE(review): the header labels assume the widened columns come out in
    # the order 8, 6, 4, 2 -- confirm against the stored table.
    , col.names = c("Wave", seq(8, 2, -2))
    , align = "r"
    , label = "ches_correlations_detailed" 
  ) %>% 
  add_header_above(c(" " = 1, "Prior quarters included" = 4)) %>% 
  column_spec(1, italic = TRUE) %>% 
  write_kable(.file.name = "sm-tableS18", overwrite = TRUE, dir = tables_path)

## Table S19 -----

# Table S19: render the stored dictionary overview and write it to
# `tables_path`. `escape = FALSE` is passed through to the table helper, and
# `.write.data = FALSE` is passed to `write_kable()`.
paper_objects$tables$dictionaries_overview %>%
  quick_kable(
    label = "dictionaries_overview",
    col.names = c("Paper", "Description", ""),
    caption = paste(
      c(
        "Overview of existing populism dictionaries (with or without separate list of anti-elitism keywords).",
        "\\emph{Note:} Based on \\citet{grundl_populist_2020}."
      ),
      collapse = " "
    ),
    escape = FALSE
  ) %>%
  # collapse repeated values in the first two columns
  collapse_rows(1:2, latex_hline = "major", valign = "t") %>%
  write_kable(
    .file.name = "sm-tableS19",
    dir = tables_path,
    overwrite = TRUE,
    .write.data = FALSE
  )

## Table S20 ----

# Table S20: render the stored dictionary comparison and write it to
# `tables_path`. The extra header groups the last two columns under
# "Dictionary".
paper_objects$tables$dictionary_comparison %>% 
  quick_kable(
    caption = paste(
      "Correlations of CHES anti-elite salience estimates with measurements created with our XLM-T classifier, the dictionary compiled by \\citet{grundl_populist_2020}, and the dictionary compiled by \\citet{rooduijn_measuring_2011}, respectively."
      , "Estimates computed on German-language tweets in Austrian, German, and Swiss parties' tweets published in the 12 months prior to the respective CHES waves' field end date. "
      , sep = " "
    )
    , align = "c"
    , label = "dictionary_comparison" 
  ) %>% 
  add_header_above(c(" " = 2, "Dictionary" = 2)) %>% 
  write_kable(.file.name = "sm-tableS20", overwrite = TRUE, dir = tables_path)

## Figure S22 ----

# Caption for Figure S22; the sentences are joined with single spaces.
# NOTE(review): `cap` is not referenced again in this part of the script --
# presumably kept as the reference caption for the manuscript; confirm.
cap <- paste(
  "Correlations of CHES anti-elite salience estimates with measurements created with our XLM-T classifier, the dictionary compiled by \\citet{grundl_populist_2020}, and the dictionary compiled by \\citet{rooduijn_measuring_2011}, respectively,"
  , "depending on how many quarters prior to CHES waves' field end dates we have included when computing party-level estimates."
  , "Estimates computed on German-language tweets in Austrian, German, and Swiss parties' tweets."
  , "\\label{fig:dictionary_comparison_by_quarters}"
  , sep = " "
)

# Export the stored plot object as a 6 x 5 inch PNG at 300 dpi.
ggsave(
  plot = paper_objects$figures$validation$dictionary_comparison_by_quarters
  , path = fig_path
  , filename = "sm-figureS22.png"
  , device = "png"
  , dpi = 300
  , height = 5
  , width = 6
  , units = "in"
  , bg = "transparent"
)

# SECTION F ----

## Figure S23 ----

# Caption for Figure S23 (text plus LaTeX label, joined with a single space).
# NOTE(review): `cap` is not referenced again in this part of the script --
# presumably kept as the reference caption for the manuscript; confirm.
cap <- paste(
  c(
    "Distribution of quarterly polls indicators by country.",
    "\\label{fig:distr_polls}"
  ),
  collapse = " "
)

# Export the stored plot object as a 4.5 x 3.6 inch PNG at 300 dpi.
ggsave(
  filename = "sm-figureS23.png",
  path = fig_path,
  plot = paper_objects$figures$analyses$polls_distribution_by_country,
  device = "png",
  width = 4.5,
  height = 3.6,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S21 ----

# Table S21: per country and data source ("Polls" / "CIP"), tabulate the
# summary() statistics of `n_obs` (all except the mean), rounded to integers.
bind_rows(
  "Polls" = paper_objects$data$polls$n_obs_by_party
  , "CIP" = paper_objects$data$cip$n_obs_by_party
  , .id = "what"
) %>% 
  group_by(country_iso3c, what) %>% 
  # "drop_last" is what summarise() does here by default; stating it keeps the
  # resulting grouping unchanged while silencing the informational message
  summarise(tmp = list(summary(n_obs)), .groups = "drop_last") %>% 
  # move the list-column to `value` and spread the summary statistics into columns
  pivot_longer(tmp, names_to = NULL) %>% 
  unnest_wider(value) %>% 
  select(-Mean) %>% 
  rename(
    Country = country_iso3c
    , Data = what
  ) %>% 
  # anonymous function instead of passing `0` through across()'s deprecated `...`
  mutate(across(where(is.numeric), function(x) round(x, 0))) %>% 
  quick_kable(
    caption = "Distribution of number of party--quarter units for which polls data respectively coalition inclusion probability (CIP) estimates are available by country."
    , label = "n_obs_stats"
  ) %>% 
  add_header_above(c(" " = 2, "Quarters per party" = 5)) %>% 
  collapse_rows(1, valign = "top", latex_hline = "none") %>% 
  # this is Table S21: writing to "sm-tableS22" collided with the regression
  # table that texreg() writes to sm-tableS22.tex further below
  write_kable(.file.name = "sm-tableS21", overwrite = TRUE, dir = tables_path, .write.data = FALSE)


## Figure S24 ----

# Caption for the CIP distribution figure (text plus LaTeX label, joined with
# a single space).
# NOTE(review): `cap` is not referenced again in this part of the script --
# presumably kept as the reference caption for the manuscript; confirm.
cap <- paste(
  c(
    "Distribution of quarterly averages of coalition inclusion probability indicators by country.",
    "\\label{fig:distr_cips}"
  ),
  collapse = " "
)

# Export the stored plot object as a 4.5 x 3 inch PNG at 300 dpi.
ggsave(
  filename = "sm-figureS24.png",
  path = fig_path,
  plot = paper_objects$figures$analyses$cip_distribution_by_country,
  device = "png",
  width = 4.5,
  height = 3,
  units = "in",
  dpi = 300,
  bg = "transparent"
)

## Table S22 ----

# Table S22: regression table for the stored polls models.
# Model names are title-cased before being handed to texreg().
tmp <- setNames(
  paper_objects$misc$regressions$elitecriticism_polls,
  str_to_title(names(paper_objects$misc$regressions$elitecriticism_polls))
)

texreg(
  l = tmp,
  file = file.path(tables_path, "sm-tableS22.tex"),
  label = "tab:polls_regression",
  caption = paste(
    c(
      "Regression of parties' anti-elite strategies on their average polling results at $t-1$.",
      "All variables recorded at the party–quarter level.",
      "Models include the lag of the anti-elite strategy indicator and party fixed effects.",
      "Regression coefficients estimated using OLS with panel-corrected standard errors."
    ),
    collapse = " "
  ),
  # passed on to texreg:::extract.plm, and then to plm:::summary.plm
  vcov = function(x) vcovBK(x, cluster = "time"),
  # coefficients to report, mapped to their display labels
  custom.coef.map = list(
    "lag(mean_prob_elitecriticism)" = "Lagged DV: Anti-elite strategy ($t-1$)",
    "lag(spolls_mean)" = "Polling average ($t-1$)"
  ),
  custom.gof.rows = list("Party FEs" = c("Yes", "Yes")),
  stars = c(0.001, 0.01, 0.05),
  digits = 3,
  leading.zero = TRUE,
  # layout options
  center = TRUE,
  sideways = FALSE,
  dcolumn = TRUE,
  booktabs = TRUE,
  use.packages = FALSE
)

## Table S23 ----

# Table S23: regression table for the stored coalition inclusion probability
# (CIP) models. Model names are title-cased before being handed to texreg().
tmp <- setNames(
  paper_objects$misc$regressions$elitecriticism_cip,
  str_to_title(names(paper_objects$misc$regressions$elitecriticism_cip))
)

texreg(
  l = tmp,
  file = file.path(tables_path, "sm-tableS23.tex"),
  label = "tab:cip_regression",
  caption = paste(
    c(
      "Regression of parties' anti-elite strategies on their coalition inclusion probability at $t-1$.",
      "All variables recorded at the party–quarter level.",
      "Models include the lag of the anti-elite strategy indicator and party fixed effects.",
      "Regression coefficients estimated using OLS with panel-corrected standard errors."
    ),
    collapse = " "
  ),
  # passed on to texreg:::extract.plm, and then to plm:::summary.plm
  vcov = function(x) vcovBK(x, cluster = "time"),
  # coefficients to report, mapped to their display labels
  custom.coef.map = list(
    "lag(mean_prob_elitecriticism)" = "Lagged DV: Anti-elite strategy ($t-1$)",
    "lag(cip_mean)" = "Coalition Inclusion Probability ($t-1$)"
  ),
  custom.gof.rows = list("Party FEs" = c("Yes", "Yes")),
  stars = c(0.001, 0.01, 0.05),
  digits = 3,
  leading.zero = TRUE,
  # layout options
  center = TRUE,
  sideways = FALSE,
  dcolumn = TRUE,
  booktabs = TRUE,
  use.packages = FALSE
)



